import pyforest
from math import sqrt
from xgboost import XGBRegressor
from yellowbrick.model_selection import LearningCurve
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from yellowbrick.model_selection import cv_scores
from feature_engine import variable_transformers as vt
from feature_engine.discretisers import EqualFrequencyDiscretiser
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
from feature_engine.outlier_removers import Winsorizer
from feature_engine.categorical_encoders import RareLabelCategoricalEncoder
# Load the prepared spend data and stamp every row with its calendar month.
df = pd.read_excel('final.xlsx')
# NOTE(review): .str.cat needs string operands — this assumes df['Month'] is
# already a string column; confirm, otherwise cast with .astype(str) first.
df["Month_Year"] = df['Year'].apply(str).str.cat(df['Month'], sep ="_")
# 17 monthly periods: Jan-2019 .. May-2020.
date = pd.period_range('2019-01', periods=17, freq='M')
date
df['Month_Year'].value_counts()
# Row counts per month, in chronological order; they must sum to len(df)
# for the zip/repeat below to line up with the dataframe.
seq = [583, 396, 445, 428, 401, 387, 456, 418, 449, 432, 444, 434, 358, 360, 371, 347, 316]
# Repeat each period `count` times so every row receives its month period.
df['Final_Date'] = [item for item, count in zip(date, seq) for i in range(count)]
df = df.set_index('Final_Date')
Average spend per month for the respective columns
# Monthly average spend, overall and within each grouping dimension,
# broadcast back onto every row via groupby().transform.
df['Avg_pr_M'] = df.groupby('Final_Date')['Spend'].transform('mean')
df['Avg_pr_M_pr_Tower'] = df.groupby(['Final_Date', 'Tower'])['Spend'].transform('mean')
df['Avg_pr_M_pr_Category'] = df.groupby(['Final_Date', 'Category'])['Spend'].transform('mean')
df['Avg_pr_M_pr_Country'] = df.groupby(['Final_Date', 'Country'])['Spend'].transform('mean')
df['Avg_pr_M_pr_Region'] = df.groupby(['Final_Date', 'Region'])['Spend'].transform('mean')
df['Avg_pr_M_pr_Vendor'] = df.groupby(['Final_Date', 'Supplier Name'])['Spend'].transform('mean')
Finding the scaled value by dividing the average spend per month for each class of a feature/column by the total average spend per month.
# Scaled features: ratio of each class's monthly average spend to the
# overall monthly average (values > 1 mean the class spends above average).
df['Scaled_Tower'] = df['Avg_pr_M_pr_Tower']/df['Avg_pr_M']
df['Scaled_Catagory'] = df['Avg_pr_M_pr_Category']/df['Avg_pr_M']
df['Scaled_Country'] = df['Avg_pr_M_pr_Country']/df['Avg_pr_M']
df['Scaled_Region'] = df['Avg_pr_M_pr_Region']/df['Avg_pr_M']
df['Scaled_Vendor'] = df['Avg_pr_M_pr_Vendor']/df['Avg_pr_M']
# Drop helper columns no longer needed.
# NOTE(review): 'Month_Year' is dropped here, but the FacetGrid cells below
# still map "Month_Year" — confirm the intended execution order, otherwise
# those plots will raise a KeyError.
df.drop(['Year','Month','Month_Year','Scaled_Catagory','Scaled_Region',
'Scaled_Vendor','Supplier Name'], inplace=True,axis=1)
df.head()
y-axis: average spend per month for each category of Tower
x-axis: month (time)
g = sns.FacetGrid(df, col="Tower", col_wrap=3, height=3.75, ylim=(0, 400000))
g = g.map(sns.pointplot, "Month_Year", "Avg_pr_M_pr_Tower", color=".3")
g.set_xticklabels(labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
y-axis: average spend per month for each category of Region
x-axis: month (time)
g = sns.FacetGrid(df, col="Region", col_wrap=3, height=3.75, ylim=(0, 750000))
g = g.map(sns.pointplot, "Month_Year", "Avg_pr_M_pr_Region", color=".3")
g.set_xticklabels(labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
# Plot the monthly average-spend trend for each Category.
# (Indentation restored — the loop body was flattened in the export.)
catagories = df.Category.unique()
for catagorie in catagories:
    df1 = df[df['Category'] == catagorie]['Avg_pr_M_pr_Category']
    df_cat0 = df1.groupby('Final_Date').mean()
    df_cat0.plot(figsize=(15,8))
# Plot the monthly average-spend trend for each Country, with a legend.
# (Indentation restored — the loop body was flattened in the export.)
countries = df.Country.unique()
for country in countries:
    df1 = df[df['Country'] == country]['Avg_pr_M_pr_Country']
    df_cat0 = df1.groupby('Final_Date').mean()
    df_cat0.plot(figsize=(15,8)).legend(countries)
From the plots above, there are a few common insights.
def plot_categories2(df, var):
    """Bar chart of category frequency (`perc_data`) for `var`, with the
    mean spend per category overlaid on a secondary y-axis.

    Expects a frame produced by calculate_mean_target_per_category,
    i.e. columns `var`, 'perc_data' and 'Spend'.
    (Indentation restored — the body was flattened in the export.)
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    plt.xticks(df.index, df[var], rotation=90)
    ax2 = ax.twinx()  # second y-axis sharing the x-axis
    ax.bar(df.index, df["perc_data"], color='lightgrey')
    ax2.plot(df.index, df["Spend"], color='green', label='Seconds')
    ax.set_ylabel('percentage of spending per feature')
    ax.set_xlabel(var)
    ax2.set_ylabel('Average spending per feature')
    plt.show()
for var in ['Tower','Category','Country','Region','Supplier Name']:
temp_df = calculate_mean_target_per_category(df, var)
plot_categories2(temp_df, var)
df.isnull().sum()
High cardinality leads to an uneven distribution of categories between the train and test sets.
# Report the cardinality (number of distinct values) of every categorical column.
# (Indentation restored — the loop body was flattened in the export.)
obj_df = df.select_dtypes(include=['object']).copy()
for column in obj_df.columns:
    print('Feature: {} has Cardinality Value: {}'.format(column, len(df[column].unique())))
    #print(df[column].unique())
    print()
Rare values may cause overfitting in tree-based algorithms.
# For each categorical variable, plot category frequencies with a 5% line
# flagging the rare-label threshold.
# (Indentation restored — the loop body was flattened in the export.)
total = len(df)
# for each categorical variable
for col in obj_df:
    temp_df = pd.Series(df[col].value_counts() / total)
    fig = temp_df.sort_values(ascending=False).plot.bar()
    fig.set_xlabel(col)
    # add a line at 5 % to flag the threshold for rare categories
    fig.axhline(y=0.05, color='red')
    fig.set_ylabel('Percentage of data')
    plt.show()
def calculate_mean_target_per_category(df, var):
    """Return a frame with, per category of `var`: its share of the data
    ('perc_data') and its mean 'Spend'.

    (Indentation restored — the body was flattened in the export.)
    """
    total = len(df)
    # Share of rows per category, ordered by frequency.
    temp_df = pd.Series(df[var].value_counts() / total).reset_index()
    temp_df.columns = [var, 'perc_data']
    # Attach the mean target per category.
    temp_df = temp_df.merge(df.groupby([var])['Spend'].mean().reset_index(),
                            on=var,
                            how='left')
    return temp_df
temp_df = calculate_mean_target_per_category(df, 'Tower')
temp_df
def plot_categories(df, var):
    """Bar chart of category frequency for `var` with mean spend overlaid on
    a twin axis, plus a horizontal line at 5% marking the rare-label cut-off.

    Expects columns `var`, 'perc_data' and 'Spend'
    (output of calculate_mean_target_per_category).
    (Indentation restored — the body was flattened in the export.)
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    plt.xticks(df.index, df[var], rotation=90)
    ax2 = ax.twinx()  # second y-axis sharing the x-axis
    ax.bar(df.index, df["perc_data"], color='lightgrey')
    ax2.plot(df.index, df["Spend"], color='green', label='Seconds')
    ax.axhline(y=0.05, color='red')  # rare-category threshold
    ax.set_ylabel('percentage of spending per feature')
    ax.set_xlabel(var)
    ax2.set_ylabel('Average spending per feature')
    plt.show()
plot_categories(temp_df, 'Tower')
This analysis is important because the rare categories have higher spending — they matter, but we do not have enough data for the model to learn them properly.
temp_df = calculate_mean_target_per_category(df, 'Category')
plot_categories(temp_df, 'Category')
temp_df = calculate_mean_target_per_category(df, 'Country')
plot_categories(temp_df, 'Country')
temp_df = calculate_mean_target_per_category(df, 'Supplier Name')
plot_categories(temp_df, 'Supplier Name')
In this section we study the linear relationship of the target with the other features that were created. Our conclusion is that even after log transformation of the variables, the linear relationship is not good.
sns.lmplot(x="Avg_pr_M_pr_Tower", y="Spend", data=df, order=1)
sns.lmplot(x="Avg_pr_M_pr_Category", y="Spend", data=df, order=1)
sns.lmplot(x="Avg_pr_M_pr_Country", y="Spend", data=df, order=1)
sns.lmplot(x="Avg_pr_M_pr_Region", y="Spend", data=df, order=1)
sns.lmplot(x="Avg_pr_M_pr_Vendor", y="Spend", data=df, order=1)
df['log_Avg_pr_M_pr_Tower'] = np.log(df['Avg_pr_M_pr_Tower'])
sns.lmplot(x="log_Avg_pr_M_pr_Tower", y="Spend", data=df, order=1)
df['log_Avg_pr_M_pr_Category'] = np.log(df['Avg_pr_M_pr_Category'])
sns.lmplot(x="log_Avg_pr_M_pr_Category", y="Spend", data=df, order=1)
Cont_df = df.select_dtypes(include=['int64', 'float']).copy()
correlation_matrix = Cont_df.corr().round(2)
figure = plt.figure(figsize=(12, 12))
sns.heatmap(data=correlation_matrix, annot=True)
sns.lmplot(x="Scaled_Country", y="Avg_pr_M_pr_Country", data=df, order=1)
sns.lmplot(x="Scaled_Vendor", y="Avg_pr_M_pr_Vendor", data=df, order=1)
sns.lmplot(x="Scaled_Tower", y="Avg_pr_M_pr_Tower", data=df, order=1)
sns.lmplot(x="Scaled_Catagory", y="Avg_pr_M_pr_Category", data=df, order=1)
So our belief that the newly created scaled features could be used as additional features to capture extra information does not hold: they are highly correlated with their average values, so we need to remove them.
We study the presence of outliers using the inter-quantile proximity rule.
import scipy.stats as stats
def diagnostic_plots(df, variable):
    """Show a histogram, a normal Q-Q plot and a boxplot side by side
    for `variable` of `df`.

    (Indentation restored — the body was flattened in the export.)
    """
    # define figure size
    plt.figure(figsize=(16, 4))
    # histogram
    plt.subplot(1, 3, 1)
    sns.distplot(df[variable], bins=30)
    plt.title('Histogram')
    # Q-Q plot against a normal distribution
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.ylabel('RM quantiles')
    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')
    plt.show()
for column in ['Avg_pr_M', 'Avg_pr_M_pr_Tower', 'Avg_pr_M_pr_Category','Avg_pr_M_pr_Country',
'Avg_pr_M_pr_Region', 'Avg_pr_M_pr_Vendor', 'Spend']:
print('The feature is {}:'.format(column))
diagnostic_plots(df, column)
# function to find upper and lower boundaries
# for skewed distributed variables
def find_skewed_boundaries(df, variable, distance):
    """Return (upper, lower) outlier boundaries for a skewed variable.

    Boundaries are quantile(0.25/0.75) -/+ `distance` * IQR; pass
    distance=1.5 for the standard rule or 3 for extreme outliers.
    (Indentation restored — the body was flattened in the export.)
    """
    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)
    lower_boundary = df[variable].quantile(0.25) - (IQR * distance)
    upper_boundary = df[variable].quantile(0.75) + (IQR * distance)
    return upper_boundary, lower_boundary
# looking for outliers,
# using the interquantile proximity rule
# IQR * 1.5, the standard metric
print('total data: {}'.format(len(df)))
print()
for feature in ['Avg_pr_M_pr_Category','Avg_pr_M_pr_Vendor','Spend', 'Avg_pr_M_pr_Region','Avg_pr_M_pr_Tower']:
upper_boundary, lower_boundary = find_skewed_boundaries(df, feature, 1.5)
print('{}: upper limit & {}: lower limit'.format(upper_boundary, lower_boundary) )
print('data with value bigger than {}: {}'.format(upper_boundary, len(df[df[feature] > upper_boundary])))
print('% data with value bigger than {}: {}'.format(upper_boundary, len(df[df[feature] > upper_boundary])/len(df)))
print()
We are planning to go ahead with tree-based algorithms, hence feature scaling is not important.
def diagnostic_plots(df, variable):
    """Plot a histogram and a normal Q-Q plot side by side for `variable`.

    NOTE(review): this redefinition shadows the 3-panel diagnostic_plots
    defined earlier in the notebook — confirm that is intended.
    (Indentation restored — the body was flattened in the export.)
    """
    plt.figure(figsize=(15,6))
    plt.subplot(1, 2, 1)
    df[variable].hist()
    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.show()
# Transforming Variable
lt1 = vt.BoxCoxTransformer(variables = ['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category','Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])
lt1.fit(df)
data_tf1 = lt1.transform(df)
lt2 = vt.YeoJohnsonTransformer(variables = ['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category','Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])
lt2.fit(df)
data_tf2 = lt2.transform(df)
# plotting
for variable in ['Avg_pr_M', 'Avg_pr_M_pr_Tower', 'Avg_pr_M_pr_Category','Avg_pr_M_pr_Country',
'Avg_pr_M_pr_Region', 'Avg_pr_M_pr_Vendor']:
print('Plot for {}'.format(variable))
diagnostic_plots(df, variable)
print('BoxCox Transformed Plot for {}'.format(variable))
diagnostic_plots(data_tf1, variable)
print('Yeo Johnson Transformed Plot for {}'.format(variable))
diagnostic_plots(data_tf2, variable)
# Fix: `data_tf` was never defined — the Box-Cox-transformed frame is `data_tf1`.
diagnostic_plots(data_tf1, 'Avg_pr_M')
Discretisation is the process of transforming continuous variables into discrete variables by creating a set of contiguous intervals that span the range of the variable's values.
Discretisation helps handle outliers and may improve value spread in skewed variables.
Equal frequency discretisation divides the scope of possible values of the variable into N bins, where each bin carries the same amount of observations. This is a Unsupervised discretisation methods.
variable = ['Scaled_Tower','Scaled_Catagory','Scaled_Country', 'Scaled_Region','Scaled_Vendor']
X_train, X_test, y_train, y_test = train_test_split(df[variable], df['Spend'],
test_size=0.3, random_state=0)
X_train.shape, X_test.shape
disc = EqualFrequencyDiscretiser(q=10, variables=variable, return_object=True)
# find the intervals
disc.fit(X_train)
# transform train and text
train_t = disc.transform(X_train)
test_t = disc.transform(X_test)
train_t.dtypes
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Tower')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Tower')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Catagory')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Catagory')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Country')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Country')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Catagory')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Category')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Vendor')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Vendor')
The variables don't show a clear linear relationship with target variable after discretization. One way to solve this is by encoding.
enc = OrdinalCategoricalEncoder(encoding_method = 'ordered')
enc.fit(train_t, y_train)
train_t = enc.transform(train_t)
test_t = enc.transform(test_t)
enc.encoder_dict_
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Tower')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Tower')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Catagory')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Catagory')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Country')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Country')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Catagory')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Category')
pd.concat([train_t, y_train], axis=1).groupby('Scaled_Vendor')['Spend'].mean().plot()
plt.ylabel('mean of Scaled_Vendor')
From the above analysis we can say that, even after binning and encoding, a few of the variables do not have a monotonic relationship with the target variable. Hence we should remove them. The variables to be used after binning are:
cols = ['Supplier Name', 'Region', 'Country', 'Tower', 'Category']
X_train, X_test, y_train, y_test = train_test_split(df[cols], df['Spend'],
test_size=0.3, random_state=0)
X_train.shape, X_test.shape
for col in cols:
print(X_train.groupby(col)[col].count() / len(X_train)) # frequency
print()
for col in ['Region', 'Country', 'Tower', 'Category']:
temp_df = pd.Series(X_train[col].value_counts() / len(X_train) )
# make plot with the above percentages
fig = temp_df.sort_values(ascending=False).plot.bar()
fig.set_xlabel(col)
# add a line at 5 % to flag the threshold for rare categories
fig.axhline(y=0.05, color='red')
fig.set_ylabel('Percentage of houses')
plt.show()
def find_non_rare_labels(df, variable, tolerance):
    """Return the categories of `variable` whose share of rows exceeds
    `tolerance` (everything else is considered a rare label).

    (Indentation restored — the body was flattened in the export.)
    """
    temp = df.groupby([variable])[variable].count() / len(df)
    return temp[temp > tolerance].index.tolist()
def rare_encoding(X_train, X_test, variable, tolerance):
    """Group infrequent categories of `variable` into the label 'Rare'.

    The non-rare category list is learned on X_train only and applied to
    both sets, so train and test stay consistent. Inputs are copied, not
    mutated; returns (X_train, X_test).
    (Indentation restored — the body was flattened in the export.)
    """
    X_train = X_train.copy()
    X_test = X_test.copy()
    # categories frequent enough to keep, learned from the train set
    frequent_cat = find_non_rare_labels(X_train, variable, tolerance)
    # re-group everything else under the single label 'Rare'
    X_train[variable] = np.where(X_train[variable].isin(frequent_cat), X_train[variable], 'Rare')
    X_test[variable] = np.where(X_test[variable].isin(frequent_cat), X_test[variable], 'Rare')
    return X_train, X_test
for variable in ['Region', 'Country', 'Tower', 'Category']:
X_train, X_test = rare_encoding(X_train, X_test, variable, 0.05)
rare_encoder = RareLabelCategoricalEncoder(
tol=0.05, # minimal percentage to be considered non-rare
n_categories=4, # minimal number of categories the variable should have to re-cgroup rare categories
variables=['Region','Country','Tower','Category'] # variables to re-group
)
rare_encoder.fit(X_train.fillna('Missing'))
X_train = rare_encoder.transform(X_train.fillna('Missing'))
X_test = rare_encoder.transform(X_test.fillna('Missing'))
for col in ['Region', 'Country', 'Tower', 'Category']:
temp_df = pd.Series(X_train[col].value_counts() / len(X_train) )
# make plot with the above percentages
fig = temp_df.sort_values(ascending=False).plot.bar()
fig.set_xlabel(col)
# add a line at 5 % to flag the threshold for rare categories
fig.axhline(y=0.05, color='red')
fig.set_ylabel('Percentage of houses')
plt.show()
Going ahead with frequency encoding. The reason is that, since we are planning for tree-based algorithms, one-hot encoding will not work well. In addition, frequency encoding adds extra information to the data and does not increase the feature space.
count_enc = CountFrequencyCategoricalEncoder(encoding_method='count',
variables=['Region', 'Country',
'Tower', 'Category'])
count_enc.fit(X_train)
count_enc.encoder_dict_
X_train = count_enc.transform(X_train)
X_test = count_enc.transform(X_test)
# Looking at the result
X_train.head()
# Running diagonistic plot on transformed variables using box cox transformation i.e. data_tf1
# from step 1.0.5.1
for column in ['Avg_pr_M', 'Avg_pr_M_pr_Tower', 'Avg_pr_M_pr_Category','Avg_pr_M_pr_Country',
'Avg_pr_M_pr_Region', 'Avg_pr_M_pr_Vendor', 'Spend']:
print('The feature is {}:'.format(column))
diagnostic_plots(data_tf1, column)
Winsorization is a way to minimize the influence of outliers in your data by either:
1. Assigning the outlier a lower weight,
2. Changing the value so that it is close to other values in the set.
windsoriser = Winsorizer(distribution='skewed', # choose skewed for IQR rule boundaries or gaussian for mean and std
tail='both', # cap left, right or both tails
fold=1.5,
variables=['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])
windsoriser.fit(data_tf1)
data_tf1_ROut = windsoriser.transform(data_tf1)
for column in ['Avg_pr_M', 'Avg_pr_M_pr_Tower', 'Avg_pr_M_pr_Category','Avg_pr_M_pr_Country',
'Avg_pr_M_pr_Region', 'Avg_pr_M_pr_Vendor']:
print('The feature is {}:'.format(column))
diagnostic_plots(data_tf1_ROut, column)
1. Building an end-to-end pipeline which includes both hyper-parameter tuning for the prediction algorithm and selecting the best parameters for feature engineering.
df.head(4)
sample = pd.read_excel('sample2.xlsx', date_parser=True)
sample.drop(['Final_Date', 'Supplier Name'], axis=1, inplace = True)
sample['Spend'] = 0
sample.head(2)
lt1 = vt.BoxCoxTransformer(variables = ['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region',
'Avg_pr_M_pr_Vendor'])
lt1.fit(df)
df = lt1.transform(df)
sample = lt1.transform(sample)
lt2 = vt.YeoJohnsonTransformer(variables = ['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region',
'Avg_pr_M_pr_Vendor'])
lt2.fit(df)
df_2 = lt2.transform(df)
sample = lt2.transform(sample)
df_2.head(2)
X_train, X_test, y_train, y_test = train_test_split(
df_2.drop(['Spend'],axis=1), # predictors
df_2['Spend'], # target
test_size=0.2, # percentage of obs in test set
random_state=0) # seed to ensure reproducibility
X_train.shape, X_test.shape
DEF_pipe = Pipeline([
# discretization & ordinal encoding
('discrit',
EqualFrequencyDiscretiser(q=10, variables=['Scaled_Tower','Scaled_Country'], return_object = True)),
('ord_enc',
OrdinalCategoricalEncoder(encoding_method = 'arbitrary', variables=['Scaled_Tower','Scaled_Country'])),
# Rare Label Encoding and Catagorical(frequency) Encoding
('encoder_rare_label',
RareLabelCategoricalEncoder(n_categories=4, variables=['Region','Country','Tower','Category'], replace_with='Rare')),
('categorical_encoder',
CountFrequencyCategoricalEncoder(encoding_method='count', variables=['Region','Country','Tower','Category'])),
# Outlier Treatment
('winsorization',
Winsorizer(distribution='skewed', tail='both', fold=1.5,
variables=['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])),
# Algorithm
('reg', Lasso())
])
param_grid = {
# trying different feature engineering parameters
'discrit__q': [10, 15],
'encoder_rare_label__tol': [0.02, 0.04, 0.05],
# trying different gradient boosted tree model paramenters
'reg__alpha': [0.1,0.3,0.5,0.8,1]
}
grid_search = GridSearchCV(DEF_pipe, param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)
print(("best r2 from grid search: %.3f" % grid_search.score(X_train, y_train)))
grid_search.best_estimator_
DEF_pipe = Pipeline([
# discretization & ordinal encoding
('discrit',
EqualFrequencyDiscretiser(q=10, variables=['Scaled_Tower','Scaled_Country'], return_object = True)),
('ord_enc',
OrdinalCategoricalEncoder(encoding_method = 'arbitrary', variables=['Scaled_Tower','Scaled_Country'])),
# Rare Label Encoding and Catagorical(frequency) Encoding
('encoder_rare_label',
RareLabelCategoricalEncoder(n_categories=4, variables=['Region','Country','Tower','Category'], replace_with='Rare')),
('categorical_encoder',
CountFrequencyCategoricalEncoder(encoding_method='count', variables=['Region','Country','Tower','Category'])),
# Outlier Treatment
('winsorization',
Winsorizer(distribution='skewed', tail='both', fold=1.5,
variables=['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])),
# Algorithm
('gbm', GradientBoostingRegressor(learning_rate=0.01, random_state=10))
])
param_grid = {
# trying different feature engineering parameters
'discrit__q': [10, 15],
'encoder_rare_label__tol': [0.02, 0.04, 0.05],
# trying different gradient boosted tree model paramenters
'gbm__learning_rate': [0.1,0.5], 'gbm__n_estimators': [200,400,500]
}
grid_search = GridSearchCV(DEF_pipe, param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)
# we print the best score over the train set
print(("best r2 from grid search: %.3f" % grid_search.score(X_train, y_train)))
# we can print the best estimator parameters like this
grid_search.best_estimator_
prediction(X_train, X_test)
DEF_pipe = Pipeline([
# discretization & ordinal encoding
('discrit',
EqualFrequencyDiscretiser(q=10, variables=['Scaled_Tower','Scaled_Country'], return_object = True)),
('ord_enc',
OrdinalCategoricalEncoder(encoding_method = 'arbitrary', variables=['Scaled_Tower','Scaled_Country'])),
# Rare Label Encoding and Catagorical(frequency) Encoding
('encoder_rare_label',
RareLabelCategoricalEncoder(n_categories=4, variables=['Region','Country','Tower','Category'], replace_with='Rare')),
('categorical_encoder',
CountFrequencyCategoricalEncoder(encoding_method='count', variables=['Region','Country','Tower','Category'])),
# Outlier Treatment
('winsorization',
Winsorizer(distribution='skewed', tail='both', fold=1.5,
variables=['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])),
# Algorithm
('reg', RandomForestRegressor())
])
param_grid = {
# trying different feature engineering parameters
'discrit__q': [10, 15],
'encoder_rare_label__tol': [0.02, 0.04, 0.05],
# trying different gradient boosted tree model paramenters
'reg__n_estimators': [100,200,400,500], 'reg__random_state': [10]
}
grid_search = GridSearchCV(DEF_pipe, param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)
print(("best r2 from grid search: %.3f" % grid_search.score(X_train, y_train)))
grid_search.best_estimator_
prediction(X_train, X_test)
DEF_pipe = Pipeline([
# discretization & ordinal encoding
('discrit',
EqualFrequencyDiscretiser(q=10, variables=['Scaled_Tower','Scaled_Country'], return_object = True)),
('ord_enc',
OrdinalCategoricalEncoder(encoding_method = 'arbitrary', variables=['Scaled_Tower','Scaled_Country'])),
# Rare Label Encoding and Catagorical(frequency) Encoding
('encoder_rare_label',
RareLabelCategoricalEncoder(n_categories=4, variables=['Region','Country','Tower','Category'], replace_with='Rare')),
('categorical_encoder',
CountFrequencyCategoricalEncoder(encoding_method='count', variables=['Region','Country','Tower','Category'])),
# Outlier Treatment
('winsorization',
Winsorizer(distribution='skewed', tail='both', fold=1.5,
variables=['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])),
# Algorithm
('reg', XGBRegressor())
])
param_grid = {
    # feature-engineering hyper-parameters
    'discrit__q': [10, 15],
    'encoder_rare_label__tol': [0.02, 0.04, 0.05],
    # XGBoost hyper-parameters.
    # Fix: XGBRegressor's minimum-loss-reduction parameter is `gamma`;
    # `n_gamma` does not exist and would make GridSearchCV raise.
    'reg__n_estimators': [100, 200, 400, 500],
    'reg__random_state': [10],
    'reg__learning_rate': [0.01, 0.1, 0.5, 1],
    'reg__gamma': [0, 0.1],
}
grid_search = GridSearchCV(DEF_pipe, param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)
print(("best r2 from grid search: %.3f" % grid_search.score(X_train, y_train)))
grid_search.best_estimator_
prediction(X_train, X_test)
def prediction(X_train, X_test):
    """Print train/test MSE, RMSE and R^2 for the fitted grid search.

    NOTE(review): relies on the notebook globals `grid_search`, `y_train`
    and `y_test` being defined before it is called.
    (Indentation restored — the body was flattened in the export.)
    """
    # Prediction Using GridSearch to make use of Cross Validation Approach
    X_train_preds = grid_search.predict(X_train)
    X_test_preds = grid_search.predict(X_test)
    print('train mse: {}'.format(mean_squared_error(y_train, X_train_preds)))
    print('train rmse: {}'.format(sqrt(mean_squared_error(y_train, X_train_preds))))
    print('train r2: {}'.format(r2_score(y_train, X_train_preds)))
    print()
    print('test mse: {}'.format(mean_squared_error(y_test, X_test_preds)))
    print('test rmse: {}'.format(sqrt(mean_squared_error(y_test, X_test_preds))))
    print('test r2: {}'.format(r2_score(y_test, X_test_preds)))
sample.drop(['Spend'], axis=1, inplace=True)
disc = EqualFrequencyDiscretiser(q=15, variables=['Scaled_Tower','Scaled_Country'], return_object = True)
# find the intervals
disc.fit(X_train)
# transform train and text
train_t = disc.transform(X_train)
test_t = disc.transform(X_test)
sample_t = disc.transform(sample)
ce = OrdinalCategoricalEncoder(encoding_method = 'ordered', variables=['Scaled_Tower','Scaled_Country'])
ce.fit(train_t, y_train)
# transform train and text
train_t = ce.transform(train_t)
test_t = ce.transform(test_t)
sample_t = ce.transform(sample_t)
rar = RareLabelCategoricalEncoder(tol=0.02, n_categories=4, variables=['Region','Country','Tower','Category'], replace_with='Rare')
rar.fit(train_t)
# transform train and text
train_t = rar.transform(train_t)
test_t = rar.transform(test_t)
sample_t = rar.transform(sample_t)
'''fre = CountFrequencyCategoricalEncoder(encoding_method='count', variables=['Region','Country','Tower','Category'])
fre.fit(train_t)
# transform train and text
train_t = fre.transform(train_t)
test_t = fre.transform(test_t)'''
OR
ordinal_enc = OrdinalCategoricalEncoder(encoding_method='arbitrary',
variables=['Region','Country','Tower','Category'])
ordinal_enc.fit(train_t)
# transform train and text
train_t = ordinal_enc.transform(train_t)
test_t = ordinal_enc.transform(test_t)
sample_t = ordinal_enc.transform(sample_t)
win = Winsorizer(distribution='skewed', tail='both', fold=1.5,
variables=['Avg_pr_M','Avg_pr_M_pr_Tower','Avg_pr_M_pr_Category',
'Avg_pr_M_pr_Country','Avg_pr_M_pr_Region','Avg_pr_M_pr_Vendor'])
win.fit(train_t)
# transform train and text
train_t = win.transform(train_t)
test_t = win.transform(test_t)
sample_t = win.transform(sample_t)
model = XGBRegressor(random_state=10, n_estimators=500)
model.fit(train_t, y_train)
X_train_preds = model.predict(train_t)
X_test_preds = model.predict(test_t)
sample_preds = model.predict(sample_t)
print('train mse: {}'.format(mean_squared_error(y_train, X_train_preds)))
print('train rmse: {}'.format(sqrt(mean_squared_error(y_train, X_train_preds))))
print('train r2: {}'.format(r2_score(y_train, X_train_preds)))
print()
print('test mse: {}'.format(mean_squared_error(y_test, X_test_preds)))
print('test rmse: {}'.format(sqrt(mean_squared_error(y_test, X_test_preds))))
print('test r2: {}'.format(r2_score(y_test, X_test_preds)))
sample['Spend'] = sample_preds
sample.to_excel('forecasting.xlsx')
from yellowbrick.regressor import ResidualsPlot
# Instantiate the linear model and visualizer
visualizer = ResidualsPlot(model, test_alpha=0.50, train_alpha=0.5)
visualizer.fit(train_t, y_train) # Fit the training data to the visualizer
visualizer.score(test_t, y_test) # Evaluate the model on the test data
visualizer.show()
# Compare several regressors with cross-validated R^2.
# (Indentation restored — the loop body was flattened in the export.)
models = [LinearRegression(), Lasso(), RandomForestRegressor(), GradientBoostingRegressor(), XGBRegressor()]
# Fix: KFold requires shuffle=True when random_state is set — recent
# scikit-learn raises ValueError otherwise, and without shuffling the
# seed would have no effect anyway.
cv = KFold(n_splits=12, shuffle=True, random_state=42)
scores = ['r2']
for model in models:
    for score in scores:
        visualizer = cv_scores(model, train_t, y_train, cv=cv, scoring=score)
# XGBoostRegressor
visualizer = LearningCurve(model, scoring='r2')
visualizer.fit(train_t, y_train)
visualizer.show()
# RandomForest
visualizer = LearningCurve(model, scoring='r2')
visualizer.fit(train_t, y_train)
visualizer.show()
# Gradientboosting Regressor
visualizer = LearningCurve(model, scoring='r2')
visualizer.fit(train_t, y_train)
visualizer.show()
# will return a category if all 17 month data is present
# we can build seperate models on each
def convert(col_name, cat, df_temp, Avg_Value_Column):
    """Build a 17-month time-series frame for one class of a column.

    Parameters:
        col_name: column name to filter on (e.g. 'Region').
        cat: the class of that column to extract.
        df_temp: the source dataframe (grouped by its 'Final_Date').
        Avg_Value_Column: the per-month average column to aggregate.

    Returns the monthly frame indexed by 'Date' when all 17 months
    (2019-01 .. 2020-05) are present, otherwise prints a message and
    returns None.

    Fixes: the original read the global `df` instead of the `df_temp`
    argument, and assigned the fixed 17-element index before checking the
    length — so incomplete classes raised a ValueError instead of reaching
    the 'Not enough data' branch. (Indentation restored as well.)
    """
    date = np.array(['2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01', '2019-06-01',
                     '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
                     '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01'], dtype='datetime64[M]')
    idx = pd.DatetimeIndex(date)
    # Use the passed dataframe, not the notebook-global `df`.
    out = df_temp[df_temp[col_name] == cat].groupby('Final_Date')[Avg_Value_Column].mean().to_frame()
    if len(out) == 17:
        print('Length is {}'.format(len(out)))
        # Only now is it safe to attach the fixed 17-month index.
        out['Date'] = idx
        out.set_index('Date', inplace=True)
        return out
    else:
        print('Not enough data for {}:'.format(cat))
sample1 = convert('Region', 'APAC', df, 'Avg_pr_M_pr_Region')
#print(sample)
sample1.plot()
sample2 = convert('Region', 'Americas', df, 'Avg_pr_M_pr_Region')
#print(sample)
sample2.plot()
sample3 = convert('Region', 'Europe', df, 'Avg_pr_M_pr_Region')
#print(sample)
sample3.plot()
df = pd.read_excel('final2.xlsx', parse_dates=True)
rar = RareLabelCategoricalEncoder(tol=0.02, n_categories=4, variables=['Region','Country','Tower','Category'],
replace_with='Rare')
df = rar.fit_transform(df)
df_test = df.set_index(['Tower', 'Month'])
df_test[0:10]